This script combines all the cleanup that needs to be done, bringing it into one central place. (Prior to this, there were too many cleanup scripts in too many places.)
This was (mostly) originally in model2.Rmd.
The data is in three files. The macro(economic) data is something to add to both the test and the train records by joining on the timestamp. It makes more sense to do any cleanup of the macro records before joining, however, so will start with that.
# Input locations and the modeling target column.
input_dir <- '../input'  # fixed: was assigned with '='; use '<-' like the rest of the script
predict_col <- 'price_doc'
# input_filename() is a project helper that resolves a name under input_dir
# -- TODO confirm where it is defined (presumably a sourced utility file).
macro <- read.csv(input_filename('macro.csv'))
macro_drop_cols <- c() # expect to find some columns to omit.
# Parse the timestamp strings into POSIXct. NOTE(review): as.POSIXct uses the
# local timezone here -- presumably only the date part matters for daily
# macro data; confirm if exact times are ever compared.
macro$timestamp <- as.POSIXct(as.character(macro$timestamp))
# ---- Exploratory plots of every macro column ----
# Each qplot draws the raw time series for one macro column. The '## Warning'
# lines are knitr output captured when a plot dropped missing values (so the
# counts double as a rough NA census per column). Columns that show only a
# handful of distinct values are queued into macro_drop_cols for removal.
qplot(timestamp, oil_urals, data=macro, geom='line')
qplot(timestamp, gdp_quart, data=macro, geom='line')
## Warning: Removed 90 rows containing missing values (geom_path).
qplot(timestamp, cpi, data=macro, geom='line')
## Warning: Removed 31 rows containing missing values (geom_path).
qplot(timestamp, ppi, data=macro, geom='line')
## Warning: Removed 31 rows containing missing values (geom_path).
qplot(timestamp, gdp_deflator, data=macro, geom='line')
## Warning: Removed 365 rows containing missing values (geom_path).
qplot(timestamp, balance_trade, data=macro, geom='line')
## Warning: Removed 31 rows containing missing values (geom_path).
qplot(timestamp, usdrub, data=macro, geom='line')
## Warning: Removed 3 rows containing missing values (geom_path).
qplot(timestamp, eurrub, data=macro, geom='line')
## Warning: Removed 3 rows containing missing values (geom_path).
qplot(timestamp, brent, data=macro, geom='line')
## Warning: Removed 3 rows containing missing values (geom_path).
qplot(timestamp, net_capital_export, data=macro, geom='line')
## Warning: Removed 396 rows containing missing values (geom_path).
qplot(timestamp, gdp_annual, data=macro, geom='line')
qplot(timestamp, gdp_annual_growth, data=macro, geom='line')
qplot(timestamp, average_provision_of_build_contract, data=macro, geom='line')
qplot(timestamp, average_provision_of_build_contract_moscow, data=macro, geom='line')
## Warning: Removed 365 rows containing missing values (geom_path).
qplot(timestamp, rts, data=macro, geom='line')
## Warning: Removed 10 rows containing missing values (geom_path).
qplot(timestamp, micex, data=macro, geom='line')
## Warning: Removed 10 rows containing missing values (geom_path).
qplot(timestamp, deposits_value, data=macro, geom='line')
qplot(timestamp, deposits_growth, data=macro, geom='line')
## Warning: Removed 31 rows containing missing values (geom_path).
qplot(timestamp, deposits_rate, data=macro, geom='line')
## Warning: Removed 414 rows containing missing values (geom_path).
# mortgage_value looks like a cumulative series that resets -- it gets a
# monotonic version built further below.
qplot(timestamp, mortgage_value, data=macro, geom='line')
qplot(timestamp, mortgage_growth, data=macro, geom='line')
## Warning: Removed 365 rows containing missing values (geom_path).
qplot(timestamp, mortgage_rate, data=macro, geom='line')
qplot(timestamp, grp, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
qplot(timestamp, grp_growth, data=macro, geom='line')
## Warning: Removed 1023 rows containing missing values (geom_path).
qplot(timestamp, income_per_cap, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).
qplot(timestamp, real_dispos_income_per_cap_growth, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
qplot(timestamp, salary, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).
qplot(timestamp, salary_growth, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
qplot(timestamp, fixed_basket, data=macro, geom='line')
qplot(timestamp, retail_trade_turnover, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).
qplot(timestamp, retail_trade_turnover_per_cap, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).
qplot(timestamp, retail_trade_turnover_growth, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).
qplot(timestamp, labor_force, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).
qplot(timestamp, unemployment, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).
qplot(timestamp, employment, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).
qplot(timestamp, invest_fixed_capital_per_cap, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).
qplot(timestamp, invest_fixed_assets, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).
qplot(timestamp, profitable_enterpr_share, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
qplot(timestamp, unprofitable_enterpr_share, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
qplot(timestamp, share_own_revenues, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
qplot(timestamp, overdue_wages_per_cap, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
qplot(timestamp, fin_res_per_cap, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
qplot(timestamp, marriages_per_1000_cap, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
qplot(timestamp, divorce_rate, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
qplot(timestamp, construction_value, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
qplot(timestamp, invest_fixed_assets_phys, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
qplot(timestamp, pop_natural_increase, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).
qplot(timestamp, pop_migration, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
qplot(timestamp, pop_total_inc, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
qplot(timestamp, childbirth, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).
qplot(timestamp, mortality, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).
qplot(timestamp, housing_fund_sqm, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
qplot(timestamp, lodging_sqm_per_cap, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
qplot(timestamp, water_pipes_share, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
# has just two values. Drop as likely noise
macro_drop_cols <- c(macro_drop_cols, 'water_pipes_share')
qplot(timestamp, baths_share, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
# has just two values. Drop as likely noise
macro_drop_cols <- c(macro_drop_cols, 'baths_share')
qplot(timestamp, sewerage_share, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
qplot(timestamp, gas_share, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
qplot(timestamp, hot_water_share, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
# has just three values. Drop as likely noise
macro_drop_cols <- c(macro_drop_cols, 'hot_water_share')
qplot(timestamp, electric_stove_share, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
qplot(timestamp, heating_share, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
# has just two values. Drop as likely noise
macro_drop_cols <- c(macro_drop_cols, 'heating_share')
qplot(timestamp, old_house_share, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
# has just two values. Drop as likely noise
macro_drop_cols <- c(macro_drop_cols, 'old_house_share')
qplot(timestamp, average_life_exp, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).
# NOTE: 'mortarity' is the column's actual (misspelled) name in the raw data.
qplot(timestamp, infant_mortarity_per_1000_cap, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
qplot(timestamp, perinatal_mort_per_1000_cap, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
qplot(timestamp, incidence_population, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
qplot(timestamp, rent_price_4.room_bus, data=macro, geom='line')
## Warning: Removed 273 rows containing missing values (geom_path).
qplot(timestamp, rent_price_3room_bus, data=macro, geom='line')
## Warning: Removed 273 rows containing missing values (geom_path).
qplot(timestamp, rent_price_2room_bus, data=macro, geom='line')
## Warning: Removed 273 rows containing missing values (geom_path).
qplot(timestamp, rent_price_1room_bus, data=macro, geom='line')
## Warning: Removed 273 rows containing missing values (geom_path).
qplot(timestamp, rent_price_3room_eco, data=macro, geom='line')
## Warning: Removed 273 rows containing missing values (geom_path).
qplot(timestamp, rent_price_2room_eco, data=macro, geom='line')
## Warning: Removed 273 rows containing missing values (geom_path).
# problem with Feb 2013. Zoom in to confirm; the suspect value is NA'd out
# later in the cleanup section.
qplot(timestamp, rent_price_2room_eco, data=subset(macro, timestamp >='2013-01-01' & timestamp <= '2013-04-01'), geom='line')
qplot(timestamp, rent_price_1room_eco, data=macro, geom='line')
## Warning: Removed 273 rows containing missing values (geom_path).
# problem with May 2013... or what looks like one.
qplot(timestamp, rent_price_1room_eco, data=subset(macro, timestamp >='2013-02-01' & timestamp <= '2013-07-01'), geom='line')
qplot(timestamp, load_of_teachers_preschool_per_teacher, data=macro, geom='line')
## Warning: Removed 1023 rows containing missing values (geom_path).
qplot(timestamp, child_on_acc_pre_school, data=macro, geom='line')
# Has a '#!' junk level (spreadsheet-style error value); cleaned up later
# before the column is coerced to numeric.
qplot(timestamp, load_of_teachers_school_per_teacher, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).
qplot(timestamp, students_state_oneshift, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
qplot(timestamp, modern_education_share, data=macro, geom='line')
# three values, mostly NA. drop.
macro_drop_cols <- c(macro_drop_cols, 'modern_education_share')
qplot(timestamp, old_education_build_share, data=macro, geom='line')
# three values, mostly NA. drop.
macro_drop_cols <- c(macro_drop_cols, 'old_education_build_share')
qplot(timestamp, provision_doctors, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
qplot(timestamp, provision_nurse, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).
qplot(timestamp, load_on_doctors, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).
qplot(timestamp, power_clinics, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
qplot(timestamp, hospital_beds_available_per_cap, data=macro, geom='line')
## Warning: Removed 1023 rows containing missing values (geom_path).
qplot(timestamp, hospital_bed_occupancy_per_year, data=macro, geom='line')
## Warning: Removed 1023 rows containing missing values (geom_path).
qplot(timestamp, provision_retail_space_sqm, data=macro, geom='line')
## Warning: Removed 1753 rows containing missing values (geom_path).
# two values, drop
macro_drop_cols <- c(macro_drop_cols, 'provision_retail_space_sqm')
qplot(timestamp, provision_retail_space_modern_sqm, data=macro, geom='line')
## Warning: Removed 1754 rows containing missing values (geom_path).
# two values, drop
macro_drop_cols <- c(macro_drop_cols, 'provision_retail_space_modern_sqm')
qplot(timestamp, turnover_catering_per_cap, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).
qplot(timestamp, theaters_viewers_per_1000_cap, data=macro, geom='line')
## Warning: Removed 1023 rows containing missing values (geom_path).
# NOTE: 'theather' is the column's actual (misspelled) name in the raw data.
qplot(timestamp, seats_theather_rfmin_per_100000_cap, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).
# NOTE: 'visitis' is the column's actual (misspelled) name in the raw data.
qplot(timestamp, museum_visitis_per_100_cap, data=macro, geom='line')
## Warning: Removed 1023 rows containing missing values (geom_path).
qplot(timestamp, bandwidth_sports, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
qplot(timestamp, population_reg_sports_share, data=macro, geom='line')
## Warning: Removed 1023 rows containing missing values (geom_path).
qplot(timestamp, students_reg_sports_share, data=macro, geom='line')
## Warning: Removed 1023 rows containing missing values (geom_path).
qplot(timestamp, apartment_build, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
qplot(timestamp, apartment_fund_sqm, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).
Clean up the mortgage value to create a new monotonic mortgage value column.
Drop a handful of columns with a very small number of values that look uninteresting.
The rent price room eco data also looks like it has a suspicious value or two in it. Remove those to impute them with something more likely.
# mortgage_value clearly resets every Feb 1: convert it into a cumulative
# (monotonic) series by carrying each year's Jan-31 level forward into the
# following Feb-Jan window.
# NOTE: the 'montonic' (sic) column name is kept as-is -- it is written to
# the saved CSV/Rdata and downstream consumers expect that exact name.
macro$mortgage_value_montonic <- macro$mortgage_value
jan31_value <- 0
for (year in lubridate::year(min(macro$timestamp)):lubridate::year(max(macro$timestamp))) {
# ROBUSTNESS: wrap the lookup in sum(). A year with no Jan-31 row would
# otherwise yield numeric(0), silently zeroing out the running offset and
# breaking the assignment below; sum(numeric(0)) is 0, i.e. "carry forward".
jan31_value <- jan31_value + sum(subset(macro, timestamp==as.POSIXct(sprintf('%d-01-31', year)))$mortgage_value)
# All rows from this Feb 1 up to (not including) next Feb 1 get the offset.
idx <- which(macro$timestamp >= as.POSIXct(sprintf('%d-02-01', year)) & macro$timestamp < as.POSIXct(sprintf('%d-02-01', year+1)))
macro[idx,'mortgage_value_montonic'] <- jan31_value + macro[idx,'mortgage_value_montonic']
}
# Drop the low-information columns flagged during the exploratory plots above.
for (col in macro_drop_cols) {
macro[,col] <- NULL
}
# NA out the two implausible rent-price spikes spotted in the zoomed plots,
# so imputation can fill them with something more likely.
idx <- which(macro$rent_price_2room_eco == 0.1)
macro[idx,'rent_price_2room_eco'] <- NA
idx <- which(macro$rent_price_1room_eco == 2.31)
macro[idx,'rent_price_1room_eco'] <- NA
# child_on_acc_pre_school carries '#!' junk levels and comma thousands
# separators; strip both and coerce to numeric.
macro[,'child_on_acc_pre_school'] <- as.character(macro[,'child_on_acc_pre_school'])
idx <- which(macro[,'child_on_acc_pre_school'] %in% c('#!'))
macro[idx, 'child_on_acc_pre_school'] <- NA
# BUG FIX: sub() replaces only the FIRST comma, so a value like '1,234,567'
# still contained a comma and as.numeric() turned it into NA. gsub() removes
# every comma.
macro[,'child_on_acc_pre_school'] <- as.numeric(gsub(',', '', macro[, 'child_on_acc_pre_school']))
qplot(timestamp, mortgage_value_montonic, data=macro, geom='line')
# confirm we've already removed the near zero variance columns
macro_nzv <- caret::nearZeroVar(macro)
stopifnot(length(macro_nzv) == 0)
Some modeling algorithms deal poorly with missing values, so impute values where needed. We don’t want to lose the NA-ness entirely, however, so add an ‘unknown’ level to factors, and an isna indicator column for each numeric column.
# Add *_isna indicator columns (project helper fe_add_isna_col_factory
# returns a transform function; called immediately on the same data), then
# bag-impute the remaining NAs with caret.
macro_cleaned <- fe_add_isna_col_factory(macro)(macro) # don't really need to pipeline this
macro_imputer <- caret::preProcess(macro_cleaned, method=c('bagImpute'))
macro_preproc <- predict(macro_imputer, macro_cleaned)
# Sanity check: imputation should leave zero NAs.
print(sprintf('after imputation, have %d NAs in macro data', sum(is.na(macro_preproc))))
## [1] "after imputation, have 0 NAs in macro data"
# powerTransformCols presumably selects the numeric columns worth power
# transforming -- TODO confirm against its definition in the utility code.
macro_pt_cols <- powerTransformCols(macro_preproc)
macro_pt <- caret::preProcess(macro_preproc[,macro_pt_cols], method='YeoJohnson')
# Replace the selected columns with their Yeo-Johnson transformed versions;
# note this moves them to the end of the data frame (cbind order).
macro_preproc <- cbind(sans_cols(macro_preproc, macro_pt_cols), predict(macro_pt, macro_preproc[,macro_pt_cols]))
# Persist both CSV and Rdata forms; the object name 'macro_preproc' is what
# a later load() will restore.
write.csv(macro_preproc, 'clean_20170614_macro_preproc.csv', row.names=FALSE)
save(macro_preproc, file='clean_20170614_macro_preproc.Rdata')
Test and train data need to be “fixed” to include the bad address fix, released late in the contest.
Much of the following was originally done in Clean_by_location.Rmd.
# Load train and test; give test a placeholder price_doc so the frames can
# be stacked into one data set for uniform cleaning.
train <- read.csv(input_filename('train.csv'), stringsAsFactors=FALSE)
test <- cbind(read.csv(input_filename('test.csv'), stringsAsFactors=FALSE), price_doc=NA)
# ROBUSTNESS: identical() instead of all(==). If the two name vectors ever
# had different lengths, '==' would recycle with only a warning; identical()
# fails the check outright.
stopifnot(identical(names(test), names(train)))
overall_data <- rbind(train, test)
rm(train, test)
# Apply the late-contest bad-address corrections before any location work.
bad_address_data <- read.csv(input_filename('BAD_ADDRESS_FIX.csv'), stringsAsFactors=FALSE)
overall_data <- fix_bad_addresses(overall_data, bad_address_data)
# Project helper: coerces raw columns to their intended types -- TODO confirm
# exactly which columns it touches.
overall_data <- fix_raw_column_types(overall_data)
For exploring location specific attributes it might make sense to create location id and neighborhood id columns. But in general, these high cardinality features get in the way.
# Disabled by default: hash the location / neighborhood feature columns into
# stable per-row identifiers. Kept behind if (FALSE) because these
# high-cardinality ids get in the way of most models (see note above).
if (FALSE) {
overall_data$location_id <- as.character(openssl::md5(apply(just_cols(overall_data, location_id_features), 1, paste, collapse='|')))
overall_data$neighborhood_id <- as.character(openssl::md5(apply(just_cols(overall_data, neighborhood_id_features), 1, paste, collapse='|')))
}
All the features for a given raion should be the same, so clean those up separately to ensure they are imputed consistently, like the macro data.
This was originally done in Raion_Features.Rmd and Raion_Features2.Rmd.
# slim raion data down into one row per raion so it is all imputed consistently.
# (Raion-level features repeat across every property row; unique() collapses
# them, so raion_data presumably has one row per sub_area -- TODO confirm no
# raion appears with two differing feature rows.)
raion_data <- unique(just_cols(overall_data, c('sub_area', raw_raion_features)))
As mentioned on the forums, the full_all columns are bad.
# If full_all really were the raion population, points would sit on the y=x
# reference line; the scatter shows they don't, confirming the forum report.
qplot(raion_popul, full_all, data=raion_data)+geom_abline(slope=1, intercept=0)
Crib from Josep Garriga’s post [https://www.kaggle.com/aralai/full-all-vs-raion-popul/notebook]
# Map each sub_area (raion) to its Moscow administrative district, per Josep
# Garriga's mapping. Rewritten as a single lookup table + loop instead of
# twelve repeated `raion_data[which(...), "district"] = ...` statements
# (which also used '=' for assignment); membership and resulting values are
# identical.
district_map <- list(
'Górod Zelenograd' = c('Krjukovo','Matushkino','Silino','Savelki','Staroe Krjukovo'),
'Séverny administrativny ókrug' = c(
'Golovinskoe','Koptevo','Vostochnoe Degunino','Dmitrovskoe','Timirjazevskoe',
'Hovrino','Zapadnoe Degunino','Beskudnikovskoe','Ajeroport','Vojkovskoe',
'Savelovskoe','Sokol','Horoshevskoe','Levoberezhnoe','Begovoe','Molzhaninovskoe'),
'Yugo-Vostochny administrativny ókrug' = c(
'Mar\'ino','Vyhino-Zhulebino','Ljublino','Kuz\'minki','Rjazanskij','Tekstil\'shhiki',
'Lefortovo','Pechatniki','Juzhnoportovoe','Nizhegorodskoe','Kapotnja','Nekrasovka'),
'Sévero-Vostochny administrativny ókrug' = c(
'Otradnoe','Bibirevo','Severnoe Medvedkovo','Jaroslavskoe','Babushkinskoe','Juzhnoe Medvedkovo',
'Losinoostrovskoe','Lianozovo','Alekseevskoe','Butyrskoe','Mar\'ina Roshha','Ostankinskoe',
'Sviblovo','Altuf\'evskoe','Rostokino','Severnoe','Marfino'),
'Vostochny administrativny ókrug' = c(
'Gol\'janovo','Perovo','Ivanovskoe','Veshnjaki','Bogorodskoe','Novokosino','Izmajlovo',
'Novogireevo','Sokolinaja Gora','Severnoe Izmajlovo','Preobrazhenskoe','Vostochnoe Izmajlovo',
'Kosino-Uhtomskoe','Sokol\'niki','Metrogorodok','Vostochnoe'),
'Yuzhny administrativny ókrug' = c(
'Orehovo-Borisovo Juzhnoe','Birjulevo Vostochnoe','Chertanovo Juzhnoe','Zjablikovo',
'Orehovo-Borisovo Severnoe','Caricyno','Nagatinskij Zaton','Chertanovo Central\'noe',
'Chertanovo Severnoe','Brateevo','Danilovskoe','Birjulevo Zapadnoe','Nagornoe',
'Nagatino-Sadovniki','Moskvorech\'e-Saburovo','Donskoe'),
'Novomoskovsky administrativny okrug' = c(
'Poselenie Shherbinka','Poselenie Desjonovskoe','Poselenie Vnukovskoe','Poselenie Sosenskoe',
'Poselenie Voskresenskoe','Poselenie Kokoshkino','Poselenie Moskovskij','Poselenie Rjazanovskoe',
'Poselenie Marushkinskoe','Poselenie Filimonkovskoe','Poselenie Mosrentgen'),
'Troitsky administrativny okrug' = c(
'Troickij okrug','Poselenie Kievskij','Poselenie Voronovskoe','Poselenie Krasnopahorskoe',
'Poselenie Novofedorovskoe','Poselenie Mihajlovo-Jarcevskoe','Poselenie Rogovskoe',
'Poselenie Pervomajskoe','Poselenie Klenovskoe','Poselenie Shhapovskoe'),
'Yugo-Západny administrativny ókrug' = c(
'Juzhnoe Butovo','Jasenevo','Kon\'kovo','Teplyj Stan','Zjuzino','Akademicheskoe',
'Cheremushki','Severnoe Butovo','Lomonosovskoe','Obruchevskoe','Gagarinskoe','Kotlovka'),
'Západny administrativny ókrug' = c(
'Kuncevo','Mozhajskoe','Ramenki','Ochakovo-Matveevskoe','Solncevo','Troparevo-Nikulino',
'Fili Davydkovo','Novo-Peredelkino','Filevskij Park','Krylatskoe','Dorogomilovo',
'Prospekt Vernadskogo','Vnukovo'),
'Sévero-Západny administrativny ókrug' = c(
'Mitino','Horoshevo-Mnevniki','Severnoe Tushino','Strogino','Shhukino',
'Juzhnoe Tushino','Pokrovskoe Streshnevo','Kurkino'),
'Tsentralny administrativny ókrug' = c(
'Presnenskoe','Taganskoe','Basmannoe','Hamovniki','Tverskoe', 'Meshhanskoe',
'Zamoskvorech\'e','Krasnosel\'skoe','Arbat','Jakimanka')
)
raion_data$district <- NA
for (district_name in names(district_map)) {
# The first character assignment coerces the all-NA logical column to
# character, exactly as the original repeated assignments did.
raion_data$district[raion_data$sub_area %in% district_map[[district_name]]] <- district_name
}
raion_data$district <- factor(raion_data$district)
# Derive per-raion male/female totals from the age-band components.
raion_data$raion_popul_male <- with(raion_data, young_male+work_male+ekder_male)
raion_data$raion_popul_female <- with(raion_data, young_female+work_female+ekder_female)
# Rebuild the bad full_all / male_f / female_f columns as district-level
# population sums (per the forum finding that the raw columns are wrong).
district <- raion_data %>%
group_by(district) %>%
summarize(full_all=sum(raion_popul),
male_f=sum(raion_popul_male),
female_f=sum(raion_popul_female))
# Drop the bad raw columns, then merge the corrected district-level values
# back on. NOTE(review): merge() reorders rows by the 'district' key and
# moves 'district' to column 1 -- the loop below relies on that.
raion_data$full_all <- NULL
raion_data$male_f <- NULL
raion_data$female_f <- NULL
raion_data <- merge(raion_data, district, by='district')
# Eyeball the empirical CDF of every feature (skipping column 1, 'district').
# Assumes the remaining columns are ecdf-able -- TODO confirm for any
# character columns such as sub_area.
for (col in names(raion_data)[-1]) {
x <- raion_data[,col]
e <- ecdf(x)
p <- qplot(x, e(x))+ggtitle(sprintf("Distribution of %s across raions", col))
print(p)
}
## Warning: Removed 22 rows containing missing values (geom_point).
## Warning: Removed 21 rows containing missing values (geom_point).
## Warning: Removed 63 rows containing missing values (geom_point).
## Warning: Removed 15 rows containing missing values (geom_point).
## Warning: Removed 15 rows containing missing values (geom_point).
## Warning: Removed 15 rows containing missing values (geom_point).
## Warning: Removed 15 rows containing missing values (geom_point).
## Warning: Removed 15 rows containing missing values (geom_point).
## Warning: Removed 15 rows containing missing values (geom_point).
## Warning: Removed 15 rows containing missing values (geom_point).
## Warning: Removed 15 rows containing missing values (geom_point).
## Warning: Removed 15 rows containing missing values (geom_point).
## Warning: Removed 15 rows containing missing values (geom_point).
## Warning: Removed 15 rows containing missing values (geom_point).
## Warning: Removed 15 rows containing missing values (geom_point).
## Warning: Removed 15 rows containing missing values (geom_point).
## Warning: Removed 15 rows containing missing values (geom_point).
## Warning: Removed 15 rows containing missing values (geom_point).
## Warning: Removed 15 rows containing missing values (geom_point).
If there were columns where most values were NA and only a few non-NAs, drop those columns. (In fact, there are no such columns.)
# Drop any column with fewer than 4 non-NA values. (As noted above, no such
# column exists, so this is a no-op safety net.)
# vapply instead of sapply: guarantees an integer vector even in edge cases,
# rather than sapply's input-dependent return type.
non_NAs <- vapply(names(raion_data), function(col) { sum(!is.na(raion_data[,col])) }, integer(1))
raion_data <- sans_cols(raion_data, names(raion_data)[non_NAs < 4])
# The raion feature-engineering pipeline. Each element is a factory: given
# the training data it returns the function that actually transforms a data
# frame (transform_data applies them in list order).
# Built in a single list() call instead of the original's repeated
# `raion_trainers <- c(raion_trainers, ...)` appends (growing-object
# anti-pattern); names, order, and closures are identical.
raion_trainers <- list(
# Add *_isna indicator columns for features with missing values.
add_is_na = fe_add_isna_col_factory,
# Impute the remaining NAs.
impute = na_impute_factory,
# Miscellaneous per-raion density/ratio features. inf_to_2max (project
# helper) presumably caps the Inf produced by zero denominators -- TODO
# confirm against its definition.
misc_fe = function(train_data) {
return(function(data)
{
data$pop_per_kmsq <- with(data, raion_popul / (area_m / 1000 / 1000))
data$children_preschool_perpop <- with(data, children_preschool / raion_popul)
data$children_school_perpop <- with(data, children_school / raion_popul)
data$children_preschool_per_quota <- with(data, inf_to_2max(children_preschool / preschool_quota))
data$children_school_per_quota <- with(data, inf_to_2max(children_school / school_quota))
data$children_per_school <- with(data, inf_to_2max(children_school / school_education_centers_raion))
data$top_20_schools_pct <- with(data, inf_to_2max(ifelse(school_education_centers_top_20_raion > 0, school_education_centers_top_20_raion / school_education_centers_raion, 0)))
return(data)
})
},
# Population sub-group counts (young/work/ekder, age bands, male/female)
# expressed as ratios of total raion population; full_all is excluded
# since it was rebuilt at district level above.
population_fe = function(train_data) {
raion_pop_cols <- setdiff(grep('_all|_male|_female', names(train_data), value=TRUE), 'full_all')
return (fe_ratios_factory('raion_popul', raion_pop_cols))
},
# Building-material counts as ratios of buildings with material info.
building_materials_fe = function(train_data) {
raion_building_material_cols <- c(
"build_count_block",
"build_count_wood",
"build_count_frame",
"build_count_brick",
"build_count_monolith",
"build_count_panel",
"build_count_foam",
"build_count_slag",
"build_count_mix"
)
return(fe_ratios_factory('raion_build_count_with_material_info', raion_building_material_cols))
},
# Building age-bucket counts as ratios of buildings with build-date info.
building_years_fe = function(train_data) {
raion_building_year_cols <- c(
"build_count_before_1920",
"build_count_1921.1945",
"build_count_1946.1970",
"build_count_1971.1995",
"build_count_after_1995"
)
return (fe_ratios_factory('raion_build_count_with_builddate_info', raion_building_year_cols))
}
)
Apply all of those transformations and save the result.
# district is a problem.
# temporarily remove
# (The factor column would otherwise go through the generic transforms, so
# stash the sub_area -> district pairing here and re-attach it after
# transform_data runs, later in the script.)
dr <- raion_data[,c('sub_area', 'district')]
raion_data_fe <- transform_data(raion_trainers, sans_cols(raion_data, 'district'), verbose=TRUE)
## [1] "applying transform 1, add_is_na to train"
## [1] "after transform class(data)" "data.frame"
## [3] "dim(data)" "147"
## [5] "93" "NAs"
## [7] "346"
## [1] "applying transform 2, impute to train"
## [1] "imputing on"
## [2] "sub_area"
## [3] "area_m"
## [4] "raion_popul"
## [5] "green_zone_part"
## [6] "indust_part"
## [7] "children_preschool"
## [8] "preschool_quota"
## [9] "preschool_education_centers_raion"
## [10] "children_school"
## [11] "school_quota"
## [12] "school_education_centers_raion"
## [13] "school_education_centers_top_20_raion"
## [14] "hospital_beds_raion"
## [15] "healthcare_centers_raion"
## [16] "university_top_20_raion"
## [17] "sport_objects_raion"
## [18] "additional_education_raion"
## [19] "culture_objects_top_25"
## [20] "culture_objects_top_25_raion"
## [21] "shopping_centers_raion"
## [22] "office_raion"
## [23] "thermal_power_plant_raion"
## [24] "incineration_raion"
## [25] "oil_chemistry_raion"
## [26] "radiation_raion"
## [27] "railroad_terminal_raion"
## [28] "big_market_raion"
## [29] "nuclear_reactor_raion"
## [30] "detention_facility_raion"
## [31] "young_all"
## [32] "young_male"
## [33] "young_female"
## [34] "work_all"
## [35] "work_male"
## [36] "work_female"
## [37] "ekder_all"
## [38] "ekder_male"
## [39] "ekder_female"
## [40] "X0_6_all"
## [41] "X0_6_male"
## [42] "X0_6_female"
## [43] "X7_14_all"
## [44] "X7_14_male"
## [45] "X7_14_female"
## [46] "X0_17_all"
## [47] "X0_17_male"
## [48] "X0_17_female"
## [49] "X16_29_all"
## [50] "X16_29_male"
## [51] "X16_29_female"
## [52] "X0_13_all"
## [53] "X0_13_male"
## [54] "X0_13_female"
## [55] "raion_build_count_with_material_info"
## [56] "build_count_block"
## [57] "build_count_wood"
## [58] "build_count_frame"
## [59] "build_count_brick"
## [60] "build_count_monolith"
## [61] "build_count_panel"
## [62] "build_count_foam"
## [63] "build_count_slag"
## [64] "build_count_mix"
## [65] "raion_build_count_with_builddate_info"
## [66] "build_count_before_1920"
## [67] "build_count_1921.1945"
## [68] "build_count_1946.1970"
## [69] "build_count_1971.1995"
## [70] "build_count_after_1995"
## [71] "raion_popul_male"
## [72] "raion_popul_female"
## [73] "full_all"
## [74] "male_f"
## [75] "female_f"
## [76] "preschool_quota_isna"
## [77] "school_quota_isna"
## [78] "hospital_beds_raion_isna"
## [79] "raion_build_count_with_material_info_isna"
## [80] "build_count_block_isna"
## [81] "build_count_wood_isna"
## [82] "build_count_frame_isna"
## [83] "build_count_brick_isna"
## [84] "build_count_monolith_isna"
## [85] "build_count_panel_isna"
## [86] "build_count_foam_isna"
## [87] "build_count_slag_isna"
## [88] "build_count_mix_isna"
## [89] "raion_build_count_with_builddate_info_isna"
## [90] "build_count_before_1920_isna"
## [91] "build_count_1921.1945_isna"
## [92] "build_count_1946.1970_isna"
## [93] "build_count_1971.1995_isna"
## [94] "build_count_after_1995_isna"
## [1] "imputer is null?" "FALSE"
## [1] "imputer class" "preProcess"
## [1] "there are " "346" "NAs in the train data"
## [1] "in imputation, predict_data has cols"
## [2] "sub_area"
## [3] "area_m"
## [4] "raion_popul"
## [5] "green_zone_part"
## [6] "indust_part"
## [7] "children_preschool"
## [8] "preschool_quota"
## [9] "preschool_education_centers_raion"
## [10] "children_school"
## [11] "school_quota"
## [12] "school_education_centers_raion"
## [13] "school_education_centers_top_20_raion"
## [14] "hospital_beds_raion"
## [15] "healthcare_centers_raion"
## [16] "university_top_20_raion"
## [17] "sport_objects_raion"
## [18] "additional_education_raion"
## [19] "culture_objects_top_25"
## [20] "culture_objects_top_25_raion"
## [21] "shopping_centers_raion"
## [22] "office_raion"
## [23] "thermal_power_plant_raion"
## [24] "incineration_raion"
## [25] "oil_chemistry_raion"
## [26] "radiation_raion"
## [27] "railroad_terminal_raion"
## [28] "big_market_raion"
## [29] "nuclear_reactor_raion"
## [30] "detention_facility_raion"
## [31] "young_all"
## [32] "young_male"
## [33] "young_female"
## [34] "work_all"
## [35] "work_male"
## [36] "work_female"
## [37] "ekder_all"
## [38] "ekder_male"
## [39] "ekder_female"
## [40] "X0_6_all"
## [41] "X0_6_male"
## [42] "X0_6_female"
## [43] "X7_14_all"
## [44] "X7_14_male"
## [45] "X7_14_female"
## [46] "X0_17_all"
## [47] "X0_17_male"
## [48] "X0_17_female"
## [49] "X16_29_all"
## [50] "X16_29_male"
## [51] "X16_29_female"
## [52] "X0_13_all"
## [53] "X0_13_male"
## [54] "X0_13_female"
## [55] "raion_build_count_with_material_info"
## [56] "build_count_block"
## [57] "build_count_wood"
## [58] "build_count_frame"
## [59] "build_count_brick"
## [60] "build_count_monolith"
## [61] "build_count_panel"
## [62] "build_count_foam"
## [63] "build_count_slag"
## [64] "build_count_mix"
## [65] "raion_build_count_with_builddate_info"
## [66] "build_count_before_1920"
## [67] "build_count_1921.1945"
## [68] "build_count_1946.1970"
## [69] "build_count_1971.1995"
## [70] "build_count_after_1995"
## [71] "raion_popul_male"
## [72] "raion_popul_female"
## [73] "full_all"
## [74] "male_f"
## [75] "female_f"
## [76] "preschool_quota_isna"
## [77] "school_quota_isna"
## [78] "hospital_beds_raion_isna"
## [79] "raion_build_count_with_material_info_isna"
## [80] "build_count_block_isna"
## [81] "build_count_wood_isna"
## [82] "build_count_frame_isna"
## [83] "build_count_brick_isna"
## [84] "build_count_monolith_isna"
## [85] "build_count_panel_isna"
## [86] "build_count_foam_isna"
## [87] "build_count_slag_isna"
## [88] "build_count_mix_isna"
## [89] "raion_build_count_with_builddate_info_isna"
## [90] "build_count_before_1920_isna"
## [91] "build_count_1921.1945_isna"
## [92] "build_count_1946.1970_isna"
## [93] "build_count_1971.1995_isna"
## [94] "build_count_after_1995_isna"
## [1] 147 93
## [1] "there are " "346"
## [3] "NAs in the predict data"
## [1] "done predicting. Have " "0"
## [3] "NAs in the results"
## [1] "after transform class(data)" "data.frame"
## [3] "dim(data)" "147"
## [5] "93" "NAs"
## [7] "0"
## [1] "applying transform 3, misc_fe to train"
## [1] "after transform class(data)" "data.frame"
## [3] "dim(data)" "147"
## [5] "100" "NAs"
## [7] "0"
## [1] "applying transform 4, population_fe to train"
## [1] "after transform class(data)" "data.frame"
## [3] "dim(data)" "147"
## [5] "126" "NAs"
## [7] "0"
## [1] "applying transform 5, building_materials_fe to train"
## [1] "after transform class(data)" "data.frame"
## [3] "dim(data)" "147"
## [5] "135" "NAs"
## [7] "0"
## [1] "applying transform 6, building_years_fe to train"
## [1] "after transform class(data)" "data.frame"
## [3] "dim(data)" "147"
## [5] "140" "NAs"
## [7] "0"
# Attach the district lookup to the raion-level features, rebuild the
# district factor (dropping unused levels), and persist the cleaned table
# in both Rdata and CSV forms.
raion_data_fe <- merge(raion_data_fe, dr, by = 'sub_area')
raion_data_fe$district <- factor(raion_data_fe$district)
save(raion_data_fe, file = 'clean_20170614_raion_data_fe.Rdata')
write.csv(raion_data_fe, file = 'clean_20170614_raion_data_fe.csv', row.names = FALSE)
I had spent several days trying to guess locations from the data without using any shape files, by simultaneously estimating the location of each property and each train station, metro station and bus station. Distance from the Kremlin is also known, so I put the Kremlin at my origin and modeled each property as just an angle theta from the Kremlin, eliminating half of the property coordinates to estimate. This seemed attractive as a pure-math, no-external-data-at-all approach.
In simulation (see kmts.R) and some testing (locations.Rmd), this ran a long time, but did not look like it would reliably converge to a correct solution (modulo rotation and reflection.)
On the forums, Chippy (Nigel Carpenter) opted to use shapefile information and go from the distances to known-location streets, a much less computationally intensive approach.
And he posted his lat, lon coordinates, in addition to his script.
Thanks, Chippy!
Unfortunately, the 'bad address' problem presumably impacts the lat/lon coordinates Chippy posted, so re-do his code here to run on the updated data.
It takes a while (~6 hours), so check to see if results already exist before running anything.
# create raion-specific raw files; save those back to input.
# The Zjuzino output file is used as a sentinel: if it exists, the ~6 hour
# solver run presumably already completed, so skip it.
if (!file.exists('../input/Zjuzino_lat_lon.csv')) {
  source('chippy_ll_solver.R')
  chippy_ll_solver(overall_data)
}
# assemble raion-level input files into a clean form with each ID
if (!file.exists('clean_20170614_lat_lon.csv') ||
    !file.exists('clean_20170614_lat_lon.Rdata')) {
  # Read every per-raion lat/lon CSV and stack them once with
  # do.call(rbind, lapply(...)). The original grew lat_lon with rbind()
  # inside the loop, copying the accumulated frame on every iteration
  # (quadratic in the number of rows).
  lst_subarea <- sort(unique(overall_data$sub_area))
  lat_lon <- do.call(rbind, lapply(lst_subarea, function(sa) {
    read.csv(paste0('../input/', sa, "_lat_lon.csv"), stringsAsFactors = FALSE)
  }))
  # save location counts for map below.
  location_counts <- lat_lon
  write.csv(location_counts, 'clean_20170614_lat_lon_counts.csv', row.names = FALSE)
  save(location_counts, file = 'clean_20170614_lat_lon_counts.Rdata')
  # merge lat_lon with the row IDs. The key is the ring distances plus the
  # sub_area, which is what the solver's output is grouped on.
  overall_data$key <- with(overall_data, paste(mkad_km, ttk_km, sadovoe_km, sub_area, sep = ":"))
  lat_lon <- merge(just_cols(overall_data, c(id_col, 'timestamp', 'key')),
                   just_cols(lat_lon, c('key', 'lat', 'lon', 'tolerance_m')),
                   by = 'key', all.x = TRUE)
  overall_data$key <- NULL  # temporary join key; don't let it leak downstream
  lat_lon <- lat_lon[order(just_col(lat_lon, id_col)), ]
  write.csv(lat_lon, 'clean_20170614_lat_lon.csv', row.names = FALSE)
  save(lat_lon, file = 'clean_20170614_lat_lon.Rdata')
}
# merge lat, lon into overall_data
# (loads the lat_lon data frame saved by the assembly step above, then joins
# it on the shared id column via merge's default by = common column names)
load('clean_20170614_lat_lon.Rdata')
overall_data <- merge(overall_data, just_cols(lat_lon, c(id_col, 'lat', 'lon', 'tolerance_m')))
Look at the counts of properties that share an exact location.
library(leaflet)
library(rgdal)
## Loading required package: sp
## rgdal: version: 1.2-5, (SVN revision 648)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 1.10.1, released 2013/08/26
## Path to GDAL shared files: /usr/share/gdal/1.10
## Loaded PROJ.4 runtime: Rel. 4.8.0, 6 March 2012, [PJ_VERSION: 480]
## Path to PROJ.4 shared files: (autodetected)
## Linking to sp version: 1.2-4
library(rjson)
# Reload the per-location property counts and look at how many properties
# share the exact same (lat, lon) point.
load('clean_20170614_lat_lon_counts.Rdata')
print(summary(unique(location_counts$count)))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 24.0 50.0 102.6 118.0 978.0
# Empirical CDF evaluated at the distinct count values, in ascending order.
cnt_values <- unique(location_counts$count)
cnt_values <- cnt_values[order(cnt_values)]
cnt_cdf <- ecdf(location_counts$count)(cnt_values)
qplot(cnt_values, cnt_cdf) + ggtitle('ECDF of properties at exact same location')
# Locations that stack more than 250 properties on a single point.
print(just_cols(subset(location_counts, count > 250), c('count', 'sub_area', 'lat', 'lon', 'tolerance_m')))
## count sub_area lat lon tolerance_m
## 6858 255 Mitino 55.84512 37.36375 5
## 7315 260 Nagatinskij Zaton 55.68522 37.69607 5
## 7553 978 Nekrasovka 55.68410 37.92835 5
## 9119 366 Poselenie Desjonovskoe 55.52464 37.37443 5
## 9230 719 Poselenie Moskovskij 55.60371 37.34452 25
## 9466 364 Poselenie Sosenskoe 55.57427 37.49955 25
## 9467 255 Poselenie Sosenskoe 55.57085 37.47434 5
## 9607 340 Poselenie Vnukovskoe 55.63269 37.32573 5
## 9608 331 Poselenie Vnukovskoe 55.63357 37.33098 10
## 9738 285 Poselenie Voskresenskoe 55.52565 37.50405 25
## 9739 260 Poselenie Voskresenskoe 55.52664 37.50568 25
## 11520 297 Solncevo 55.65362 37.40218 5
## 13277 298 Zapadnoe Degunino 55.88029 37.49646 25
# Red-yellow-green palette over 0-1000, reversed so the highest counts
# render red.
pal <- colorNumeric(palette = "RdYlGn", domain = c(0, 1000), reverse = TRUE)
# Moscow administrative-boundary shapefile (raion polygons).
shp_moscow_adm <- rgdal::readOGR(dsn = "../input/sberbankmoscowroads",
                                 layer = "moscow_adm", verbose = FALSE)
# Make a data frame with one row per raion: the polygon centroid (x, y) plus
# the raion name, used to place text labels on the map.
# Fixes two defects in the original:
#  * the feature count was hard-coded as 1:146 rather than derived from the
#    shapefile, which silently drops (or breaks on) any other layer size;
#  * raion_centroids$na relied on R's partial name matching to reach the
#    'name' column -- spelled out in full here.
raion_centroids <- do.call(rbind.data.frame, lapply(seq_len(length(shp_moscow_adm)), function(i) {
  c(coordinates(rgeos::gCentroid(shp_moscow_adm[i,])), name=as.character(shp_moscow_adm[i,]$RAION))
}))
names(raion_centroids) <- c('x','y','name')
# rbind.data.frame coerced everything to factor/character; restore numeric
# coordinates and a plain-character name column.
raion_centroids$x <- as.numeric(as.character(raion_centroids$x))
raion_centroids$y <- as.numeric(as.character(raion_centroids$y))
raion_centroids$name <- as.character(raion_centroids$name)
# make a GeoJSON form of the shapefile if we don't have one already.
if (!file.exists('shp_moscow_adm.GeoJSON')) {
  rgdal::writeOGR(shp_moscow_adm, dsn = 'shp_moscow_adm.GeoJSON',
                  driver = 'GeoJSON', layer = 1)
}
# Parse the GeoJSON back in for leaflet and extract the raion names.
gj <- rjson::fromJSON(file = 'shp_moscow_adm.GeoJSON')
gj_names <- vapply(gj$features, function(f) f$properties$RAION, character(1))
# Build the leaflet map layer by layer: raion polygons, raion-name labels at
# the centroids, per-location property-count circles, and a color legend.
# (Explicit intermediate assignments instead of a %>% chain; same layer
# order, same output.)
map <- leaflet::leaflet(data = location_counts)
map <- leaflet::addGeoJSON(map, gj)
map <- leaflet::addLabelOnlyMarkers(map,
                                    lng = raion_centroids$x, lat = raion_centroids$y,
                                    label = raion_centroids$name,
                                    labelOptions = labelOptions(noHide = TRUE,
                                                                textOnly = TRUE))
map <- leaflet::addCircles(map, lat = ~lat, lng = ~lon, col = ~pal(count),
                           fill = TRUE, weight = 10,
                           label = ~paste0("Count: ", count))
r <- leaflet::addLegend(map, "topright", pal = pal, values = ~count,
                        title = "Property count", opacity = 0.8)
r # Print the map
Looking at the properties, there are many features that are location-specific, like all the distances, and the "neighborhood" information about nearby cafes et al. There are also many raion-specific features, common across the whole raion.
This leaves fairly few features that are specific to the property itself, and it’s good to look at those:
A life sq < 5 or full sq < 5 is probably an error; remove those values to impute later. We also have full sq < life sq, usually by a lot. Guess that this is a coding error and full was accidentally recorded as the extra (non-living) area.
qplot(life_sq, full_sq, data = overall_data) + scale_x_log10() + scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 7559 rows containing missing values (geom_point).
# Areas under 5 sq m are implausible; clear them so they get imputed later.
overall_data$life_sq[which(overall_data$life_sq < 5)] <- NA
overall_data$full_sq[which(overall_data$full_sq < 5)] <- NA
# Where full < life, assume full was mis-recorded as the non-living
# remainder and reconstruct total area as life + "extra".
swap_idx <- which(overall_data$full_sq < overall_data$life_sq)
overall_data$full_sq[swap_idx] <- overall_data$life_sq[swap_idx] + overall_data$full_sq[swap_idx]
Have max floor < floor in 2136 cases. Assume floor is more accurate than max floor.
qplot(floor, max_floor, data = overall_data)
## Warning: Removed 9572 rows containing missing values (geom_point).
# Where the recorded top floor sits below the unit's own floor, trust
# `floor` and raise max_floor to match.
bad_max <- which(with(overall_data, !is.na(max_floor) & !is.na(floor) & max_floor < floor))
overall_data$max_floor[bad_max] <- overall_data$floor[bad_max]
Have only two points, one test and one train, of material==3. Drop.
qplot(material, data = overall_data)
# material == 3 occurs only twice (one train, one test row) -- too rare to
# learn anything from, so treat it as missing.
overall_data$material[which(overall_data$material == 3)] <- NA
Build year spans a huge range, from 0 to 20052009. Assume numbers before 1860 are bad, as is 4965. Convert 20052009 into 2007 as a guess.
One of the discussion pages says that properties are sometimes pre-sold, so allow that.
print(summary(overall_data$build_year))
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 1966 1980 2717 2006 20050000 14654
print(xtabs(~subset(overall_data, build_year < 1860 | build_year > 2017)$build_year))
## subset(overall_data, build_year < 1860 | build_year > 2017)$build_year
## 0 1 2 3 20 71 215 1691
## 899 555 1 2 1 1 2 1
## 2018 2019 4965 20052009
## 31 5 1 1
qplot(build_year, data = overall_data) + scale_x_continuous(limits = c(1860, 2020))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 16118 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing missing values (geom_bar).
# Years before 1860 are treated as bad data, as is the clearly bogus 4965.
overall_data$build_year[which(overall_data$build_year < 1860)] <- NA
# 20052009 looks like two years (2005 and 2009) fused together; split the
# difference and guess 2007.
overall_data$build_year[which(overall_data$build_year == 20052009)] <- 2007
overall_data$build_year[which(overall_data$build_year == 4965)] <- NA
Less than 1 room is probably incorrect.
print(summary(overall_data$num_room))
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 1.000 2.000 1.901 2.000 19.000 9572
qplot(num_room, data = overall_data)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9572 rows containing non-finite values (stat_bin).
# A room count below one cannot be right; clear it for later imputation.
overall_data$num_room[which(overall_data$num_room < 1)] <- NA
Have some kitchen sq that look like build years. Use those to guess a build year. Have many that look the same size (or bigger than) the life or full sq. Clear those.
print(summary(overall_data$kitch_sq))
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 1.000 6.000 6.544 9.000 2014.000 9572
qplot(kitch_sq, data=overall_data)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9572 rows containing non-finite values (stat_bin).
# Some kitch_sq values are clearly build years (e.g. 2014). Use them to fill
# a missing build_year, and clear the bogus "area" in every year-like case.
# The original only transferred the value; when life_sq and full_sq were
# both NA the year-sized kitch_sq survived the comparisons below and stayed
# in the data.
year_like <- with(overall_data, !is.na(kitch_sq) & kitch_sq > 1000)
fill_year <- year_like & is.na(overall_data$build_year)
overall_data$build_year[fill_year] <- overall_data$kitch_sq[fill_year]
overall_data$kitch_sq[year_like] <- NA  # the value was a year, not an area
# A kitchen as big as (or bigger than) the living/full area is not credible.
# (life_sq != 0 / full_sq != 0 preserves the original truthiness test, which
# skipped zero areas.)
idx <- with(overall_data, !is.na(life_sq) & life_sq != 0 & kitch_sq >= life_sq)
idx <- !is.na(idx) & idx
overall_data$kitch_sq[idx] <- NA
idx <- with(overall_data, !is.na(full_sq) & full_sq != 0 & kitch_sq >= full_sq)
idx <- !is.na(idx) & idx
overall_data$kitch_sq[idx] <- NA
We have one state of 33. That’s probably a typo for 3, so replace 33 with 3 and recreate the factor to drop the unused level.
plot(overall_data$state)
# A single state of 33 is assumed to be a typo for 3; collapse it, then
# rebuild the factor so the now-unused '33' level disappears.
overall_data$state[which(overall_data$state == '33')] <- '3'
overall_data$state <- factor(overall_data$state)
plot(overall_data$product_type)
# create a 'disposition' column with at least 'train' and 'submission' levels:
# rows with a known price are training rows; rows without one are the
# submission (test) set.
overall_data$disposition <- ifelse(is.na(overall_data$price_doc), 'submission', 'train')
# Transform pipeline: each named element is a factory that takes the training
# data and returns a function applying the fitted transform to any frame.
# First stage: drop near-zero-variance columns (keeping the target and id).
pipeline <- c(
  drop_nzv = function(train_data) {
    fe_drop_nzv_cols_factory(train_data, keep_cols = c(predict_col, id_col))
  }
)
pipeline <- c(pipeline, impute_property_features=function(train_data) {
  # impute property features here. Use those features, plus the lat and lon
  property_cols <- c('timestamp', raw_property_features, 'lat', 'lon')
  train_property <- just_cols(train_data, c(id_col, property_cols))
  print("train for imputing property features with columns....")
  print(names(train_property))
  print(dim(train_property))
  # Fit the imputer on the training slice only; 'disposition' is excluded.
  imputer <- na_impute_factory(
    train_property, predict_col=predict_col,
    drop_col='disposition',
    impute_on_just_na_cols=FALSE)
  function(data) {
    data_property <- just_cols(data, c(id_col, property_cols))
    print("apply imputing to property features with columns....")
    print(names(data_property))
    print(dim(data_property))
    data_property <- imputer(data_property)
    # Remove id_col as well as the property columns before cbind-ing the
    # imputed frame back on: data_property already carries id_col, so the
    # original sans_cols(data, property_cols) left a duplicate id column in
    # the result (the log showed 274 -> 275 columns for a transform that
    # removes 13 columns and adds back 14).
    data <- cbind(sans_cols(data, c(id_col, property_cols)), data_property)
    return (data)
  }
})
Add a ‘age at sale’ column, which can be negative if property pre-sold
Add a ‘floor percentage’ column
library(lubridate)
pipeline <- c(pipeline, add_date_features=function(train_data) {
  # Stateless transform: derives calendar features from the sale timestamp.
  function(data) {
    data$sold_year <- with(data, lubridate::year(timestamp))
    data$sold_month <- with(data, lubridate::month(timestamp))
    data$sold_ym <- with(data, paste(lubridate::year(timestamp), lubridate::month(timestamp), sep='-'))
    # Age of the building at the time of sale: sale year minus build year,
    # so a pre-sold property (built after the sale) comes out negative, as
    # intended. The original computed build_year - year(timestamp), which
    # inverts the sign of every value.
    data$age_at_sale <- with(data, lubridate::year(timestamp) - build_year)
    return (data)
  }
})
pipeline <- c(pipeline, add_floors_features=function(train_data) {
  # Stateless transform; train_data is unused.
  function(data) {
    # Floors above the unit, and how far up the building the unit sits (%).
    data$floors_above <- data$max_floor - data$floor
    data$floor_pct <- data$floor / data$max_floor * 100.0
    return(data)
  }
})
Scaling is not important in a tree based model, so omit (defer?)
# Disabled: scaling does not matter for tree-based models, so this stage is
# skipped (kept here in case a scale-sensitive model is tried later).
if (FALSE) {
  pipeline <- c(pipeline, scale = function(train_data) {
    scale_preprocessor_factory(train_data, predict_col = predict_col,
                               drop_cols = c(predict_col, 'id', 'disposition'))
  })
}
# fix the data: fit each pipeline stage on the training rows, then apply the
# fitted transforms to every non-skipped row. [[2]] selects the second
# returned element -- presumably the transformed full (non-skip) set, per
# the "other data 1" log lines below; verify against transform_data.
fit_rows <- subset(overall_data, disposition == 'train')
apply_rows <- subset(overall_data, disposition != 'skip')
overall_data_fe <- transform_data(pipeline, fit_rows, apply_rows,
                                  verbose = TRUE,
                                  predict_col = predict_col)[[2]]
## [1] "applying transform 1, drop_nzv to train"
## [1] "after transform class(data)" "data.frame"
## [3] "dim(data)" "30471"
## [5] "274" "NAs"
## [7] "265340"
## [1] "applying transform 1, drop_nzv to other data 1"
## [1] "after transform class(other_df)" "list"
## [3] "dim(other_df)" "NAs"
## [5] "0"
## [1] "applying transform 2, impute_property_features to train"
## [1] "train for imputing property features with columns...."
## [1] "id" "timestamp" "full_sq" "life_sq"
## [5] "floor" "max_floor" "material" "build_year"
## [9] "num_room" "kitch_sq" "state" "product_type"
## [13] "lat" "lon"
## [1] 30471 14
## [1] "imputing on" "timestamp" "full_sq" "life_sq"
## [5] "floor" "max_floor" "material" "build_year"
## [9] "num_room" "kitch_sq" "state" "product_type"
## [13] "lat" "lon"
## [1] "imputer is null?" "FALSE"
## [1] "imputer class" "preProcess"
## [1] "there are " "73466" "NAs in the train data"
## [1] "apply imputing to property features with columns...."
## [1] "id" "timestamp" "full_sq" "life_sq"
## [5] "floor" "max_floor" "material" "build_year"
## [9] "num_room" "kitch_sq" "state" "product_type"
## [13] "lat" "lon"
## [1] 30471 14
## [1] "in imputation, predict_data has cols"
## [2] "timestamp"
## [3] "full_sq"
## [4] "life_sq"
## [5] "floor"
## [6] "max_floor"
## [7] "material"
## [8] "build_year"
## [9] "num_room"
## [10] "kitch_sq"
## [11] "state"
## [12] "product_type"
## [13] "lat"
## [14] "lon"
## [1] 30471 13
## [1] "there are " "73466"
## [3] "NAs in the predict data"
## [1] "done predicting. Have " "23132"
## [3] "NAs in the results"
## [1] "after transform class(data)" "data.frame"
## [3] "dim(data)" "30471"
## [5] "275" "NAs"
## [7] "215006"
## [1] "applying transform 2, impute_property_features to other data 1"
## [1] "apply imputing to property features with columns...."
## [1] "id" "timestamp" "full_sq" "life_sq"
## [5] "floor" "max_floor" "material" "build_year"
## [9] "num_room" "kitch_sq" "state" "product_type"
## [13] "lat" "lon"
## [1] 38133 14
## [1] "in imputation, predict_data has cols"
## [2] "timestamp"
## [3] "full_sq"
## [4] "life_sq"
## [5] "floor"
## [6] "max_floor"
## [7] "material"
## [8] "build_year"
## [9] "num_room"
## [10] "kitch_sq"
## [11] "state"
## [12] "product_type"
## [13] "lat"
## [14] "lon"
## [1] 38133 13
## [1] "there are " "77343"
## [3] "NAs in the predict data"
## [1] "done predicting. Have " "23860"
## [3] "NAs in the results"
## [1] "after transform class(other_df)" "list"
## [3] "dim(other_df)" "NAs"
## [5] "0"
## [1] "applying transform 3, add_date_features to train"
## [1] "after transform class(data)" "data.frame"
## [3] "dim(data)" "30471"
## [5] "279" "NAs"
## [7] "215006"
## [1] "applying transform 3, add_date_features to other data 1"
## [1] "after transform class(other_df)" "list"
## [3] "dim(other_df)" "NAs"
## [5] "0"
## [1] "applying transform 4, add_floors_features to train"
## [1] "after transform class(data)" "data.frame"
## [3] "dim(data)" "30471"
## [5] "281" "NAs"
## [7] "215006"
## [1] "applying transform 4, add_floors_features to other data 1"
## [1] "after transform class(other_df)" "list"
## [3] "dim(other_df)" "NAs"
## [5] "0"
# Swap in the cleaned raion-level features: drop every raion column already
# present (except the sub_area join key), then merge raion_data_fe back on.
overall_data_fe <- merge(sans_cols(overall_data_fe, setdiff(names(raion_data_fe), 'sub_area')),
                         raion_data_fe, by='sub_area')
# reconfirm timestamps are compatible
# (both sides coerced to Date so the macro join keys match exactly)
macro_preproc$timestamp <- as.Date(macro_preproc$timestamp)
overall_data_fe$timestamp <- as.Date(overall_data_fe$timestamp)
overall_data_fe <- merge(overall_data_fe, macro_preproc, by='timestamp')
# Persist the final feature-engineered data set in both formats.
save(overall_data_fe, file='clean_20170614_overall_data_fe.Rdata')
write.csv(overall_data_fe, file='clean_20170614_overall_data_fe.csv', row.names=FALSE)